# This file contains code to analyze the tripartite network of donors, candidates and agendas.
# Set the working directory to the project root.
setwd("/home/bruce/Dropbox/Camp")

# Election cycle to analyze: it selects the three input files and names the output file.
t <- 2000
# NOTE(review): `time` is not referenced anywhere else in the visible script.
time <- 1
# Destination of the clustering results saved at the end of the script.
saveFile <- paste0("/home/bruce/Dropbox/Camp/Data/ClusteringResults", t, ".RData")

# PAC-to-candidate contributions. The file has no header row, so names are assigned
# by position and truncated to the number of columns actually read.
pacFile <- paste0("/home/bruce/Dropbox/Camp/Data/PAC contributions/", t, " PAC to Cand.csv")
pacCols <- c("Cycle", "FECRecNo", "PACID", "CID", "Amount", "Date", "RealCode", "Type", "DI", "FECCandID")
pacDat <- read.csv(pacFile, header = FALSE, stringsAsFactors = FALSE)
colnames(pacDat) <- pacCols[1:ncol(pacDat)]

# Candidates (no header row; exactly these 12 columns are expected).
candFile <- paste0("/home/bruce/Dropbox/Camp/Data/Candidates/", t, " Candidates.csv")
candDat <- read.csv(candFile, header = FALSE, stringsAsFactors = FALSE)
colnames(candDat) <- c(
	"Cycle", "FECCandID", "CID", "FirstLastP", "Party", "DistIDRunFor",
	"DistIDCurr", "CurrCand", "CycleCand", "CRPICO", "RecipCode", "NoPacs"
)

# Committees / PACs (no header row; exactly these 14 columns are expected).
commFile <- paste0("/home/bruce/Dropbox/Camp/Data/Committees/committees", t, ".csv")
commDat <- read.csv(commFile, header = FALSE, stringsAsFactors = FALSE)
colnames(commDat) <- c(
	"Cycle", "CmteID", "PACShort", "Affiliate", "Ultorg", "RecipID", "RecipCode",
	"FECCandID", "Party", "PrimCode", "Source", "Sensitive", "Foreign", "Active"
)

### Description of committee (PAC) variables

# Cycle		Last year (even year) of the federal 2-year election cycle
# CmteID 		Unique ID given by FEC to each committee.
# PACShort 	Standardized committee name based on PAC's sponsor.	
# Affiliate 	Usually blank. For leadpacs, shows the sponsoring member.
# Ultorg 		The standardized parent organization for the organization listed in the PACShort field. If there is no parent identified, this field will be equal to PACShort.
# RecipID		For candidate committees this will be the candidate's CID. Otherwise, it will be the same as CmteID.
# RecipCode	A two-character code defining the type of recipient. For candidates, the first character is party ("D" for Democratic, "R" for Republican, "3" for Independent or third party, "U" for Unknown). The second character is "W" for Winner, "L" for Loser, "I" for Incumbent, "C" for Challenger, "O" for Open Seat, and "N" for Non-incumbent. "N" is reserved for candidates that are neither in office nor running during the cycle in question. For party committees, the first character is party and the second character is "P". For PACs, the first character is "P" and the second character is "B" for Business, "L" for Labor, "I" for Ideological, "O" for Other and "U" for Unknown.
# FECCandID	Unique ID given to candidates by FEC.
# Party		(D,R,3,I,L) Will be null or empty if committee is not a party, joint fundraising, leadership or candidate committee.
# PrimCode	The standard five character code identifying the committee's industry or ideology.
# Source		Indicates how the PrimCode was determined.
# Sensitive	If "Y", the committee has significant business in multiple industries, some of which fall under the jurisdiction of specific congressional committees.
# Foreign		This is a bit field. Off/False indicate that the company is not owned by a foreign entity. Those that are owned by a foreign entity are on/True, sometimes "-1".
# Active		Determines if cmte is active in the cycle - 0 is no and 1 is yes

### Description of PAC contribution variables

# Cycle		Last year (even year) of the federal 2-year election cycle
# FECRecNo	A unique record identifier within a given cycle.
# PACID		The committee id number for the PAC making the contribution.
# CID		A unique identifier for candidates that is constant throughout cycles.
# Amount	The amount contributed. This will be negative for refunds.
# Date		The reported date of the contribution.
# RealCode	The standard five character code identifying the donor's industry or ideology.  Usually based on Primcode. Sometimes a PAC sponsor will have secondary interests which may replace the main realcode depending on recipient. For example, Boeing is primarily Air Transport but has Air Defense interests. Thus Boeing contributions to members of the Armed Services committee would have a realcode of Air Defense.
# Type		The transaction type code for the contribution. 24A is an Independent Expenditure against the candidate, 24C is a coordinated expenditure, 24E is an independent expenditure for the candidate, 24F is a communication cost for the candidate, 24K is a direct contribution, 24N is a communication cost against the candidate and 24Z is an in kind contribution
# DI		Whether the contribution is direct ("D") or indirect ("I"). Indirect contributions include independent expenditures and communications costs, are not subject to contribution limits and must be made completely independently of the candidate. Indirect contributions can also be against the candidate.
# FECCandID	FECCandID of recipient candidate

### Description of Candidate Variables

# Cycle		Last year (even year) of the federal two year election cycle
# FECCandID	Assigned by FEC and selected by CRP as the active, should multiples exist.
# CID		Unique identifier for each candidate. Every candidate should have one and only one CID throughout all cycles. Recipid for candidates is based on CID.
# FirstLastP	Candidate name in format of firstname lastname and party in parens, like Steve Kagen (D)
# Party		The party of the candidate. "D" for Democratic, "R" for Republican, "I" for Independent, "L" for Libertarian, "3" for other third party and "U" for Unknown.
# DistIDRunFor	Four character identifier of the office sought by the candidate. For congressional races, the first two characters are the state and the next two are the district for House candidates and "S1" or "S2" for Senate candidates. "PRES" indicates a presidential candidate.
# DistIDCurr	Four character identifier of the office currently held (if any) by the candidate. For House members, the first two characters are the state and the next two are the district. For Senators the first two characters are the state and the last two characters are "S1" or "S2". "PRES" indicates a presidential candidate. For non-incumbents, this field is blank. If a member of Congress dies or leaves office, this field should become blank. This field is frozen on election day. For cycles prior to the current cycle, DistidCurr reflects office held on Election Day of the Cycle.
# CurrCand	This field indicates whether the candidate is currently running for federal office - "Y" means yes, otherwise this field is blank. If a candidate loses a primary or drops out of the race, this field becomes blank. This field is frozen on Election Day, and thus for previous cycles can be used to show the candidate who ran in the general election.
# CycleCand	This field indicates whether the candidate ever ran for federal office during the cycle in question. Like CurrCand, "Y" means yes and blank means no. This field should be "Y" for any candidate who filed to run for office or otherwise formally declared intention to run. This does NOT change if the candidate drops out or loses a primary. Be aware that we've tightened the definition in recent cycles - for older data, CycleCand is likely to cast a broader net. Also note that incumbents are usually assumed to be running for re-election and get a "Y" in CycleCand unless there is evidence to the contrary.
# CRPICO	Identifies type of candidate - "I" is incumbent, "C" is challenger, "O" is open seat. This may be blank if the candidate is neither a member of Congress nor running this cycle. Note this is based on the office sought. A House incumbent running for the Senate would have a CRPICO of "C" or "O", not "I".
# RecipCode	A two-character code defining the type of candidate. The first character is party ("D" for Democratic, "R" for Republican, "3" for Independent or third party, "U" for Unknown). The second character is "W" for Winner, "L" for Loser, "I" for Incumbent, "C" for Challenger, "O" for Open Seat, and "N" for Non-incumbent. Incumbent, Challenger and Open Seat are based on CRPICO. "N" is reserved for candidates that are neither in office nor running during the cycle in question. This lives in dbo_CandsCRP.
# NoPacs	Indicates whether candidate has publicly committed to forego contributions from PACs

library(flexmix)


## Use only contributions
# pacDat <- pacDat[pacDat$Amount >0,]
# pacDat <- pacDat[!is.na(pacDat$CID),]


## Remove non-house candidates
# Characters 3-4 of the office code are "ES" for "PRES" and "S1"/"S2" for Senate seats.
seatCode <- substr(candDat$DistIDRunFor, 3, 4)
candDat <- candDat[!(seatCode %in% c("ES", "S1", "S2")), ]

## Add incumbency information to candidate name
# The name becomes <CRPICO><RecipCode><FirstLastP>, with no separator.
candDat$FirstLastP <- paste0(candDat$CRPICO, candDat$RecipCode, candDat$FirstLastP)

## Add Candidate name and District Information to PAC data
# Look each contribution's recipient up once; contributions whose CID is not among the
# remaining candidates get an NA name and are dropped.
candRow <- match(pacDat$CID, candDat$CID)
# Strip spaces, then punctuation, from the decorated candidate name.
pacDat$CName <- gsub('[[:punct:]]', '', gsub(" ", "", candDat$FirstLastP[candRow]))
hasCand <- !is.na(pacDat$CName)
pacDat <- pacDat[hasCand, ]
pacDat$District <- candDat$DistIDRunFor[candRow[hasCand]]

## Recode Contributions Against
# Transactions of type 24A / 24N are money spent AGAINST the recipient. For each such
# transaction the amount is credited to every candidate who
#   - runs in the same district as the targeted candidate,
#   - has a different party letter than the targeted candidate,
#   - already has at least one transaction from the same PAC, and
#   - is not also the target of an "against" transaction from that PAC.
# The credit is added to the PAC's first transaction with that candidate. The "against"
# transaction itself is then set to 0, whether or not anyone was credited.
# NOTE(review): the stated intent was "give the money to the opposite-party incumbent; if
# there is no incumbent, divide it equally among the other-party candidates". The code does
# neither: incumbency is ignored and the FULL amount is credited to each qualifying
# candidate. Confirm which behaviour is wanted before relying on the totals.

# Party letter of each candidate: second-to-last character of the name, i.e. the letter
# inside the trailing "(X)" (assumes FirstLastP ends in "(X)" as the field description says).
nameLen <- nchar(candDat$FirstLastP)
party <- substr(candDat$FirstLastP, nameLen - 1, nameLen - 1)
incumbent <- candDat$CRPICO # not used by the recoding below
district <- candDat$DistIDRunFor
type <- pacDat$Type
against <- which(is.element(type, c("24A", "24N")))

# PAC and target of every "against" transaction. Only Amount is modified in the loop,
# so these can be computed once up front.
againstPAC <- pacDat$PACID[against]
againstCID <- pacDat$CID[against]

# Iterating over the indices directly makes the loop a no-op when there are no
# "against" transactions (1:length(against) would iterate over c(1, 0)).
for (indi in against) {
	# CName has had its punctuation stripped, so the party letter is its last character.
	candAgainst <- pacDat$CName[indi]
	partyAgainst <- substr(candAgainst, nchar(candAgainst), nchar(candAgainst))
	dist <- pacDat$District[indi]
	pacAgainst <- pacDat$PACID[indi]

	# Rows of candDat: same district, different party.
	others <- which(district == dist & party != partyAgainst)
	for (other in others) {
		cidj <- candDat$CID[other]
		transID <- which(pacDat$PACID == pacAgainst & pacDat$CID == cidj)
		alsoTargeted <- any(againstCID == cidj & againstPAC == pacAgainst)
		if (length(transID) > 0 && !alsoTargeted) {
			pacDat$Amount[transID[1]] <- pacDat$Amount[transID[1]] + pacDat$Amount[indi]
		}
	}

	# The money spent against the candidate never counts as support for them.
	pacDat$Amount[indi] <- 0
}

##### COPY TO HERE MIKE
		
# Build the PAC x candidate amount matrix: amat[p, c] is the sum of Amount over all
# transactions from PAC uPAC[p] to candidate uName[c]; pairs with no transaction stay 0.
uName <- unique(pacDat$CName)
uPAC <- unique(pacDat$PACID)

amat <- matrix(0,length(uPAC),length(uName))
colnames(amat) <- uName
rownames(amat) <- uPAC

# Row/column position of every transaction, looked up once instead of scanning the
# unique vectors for each transaction.
pacRow <- match(pacDat$PACID, uPAC)
candCol <- match(pacDat$CName, uName)

# Transactions with a missing PAC id or candidate name are not accumulated (an `==`
# comparison against NA never selects a cell); any NA row/column of amat stays 0.
usable <- which(!is.na(pacDat$PACID) & !is.na(pacDat$CName))
if (length(usable) > 0) {
	# Column-major linear index of each transaction's cell (computed in double precision,
	# so large matrices cannot overflow integer arithmetic).
	cell <- pacRow[usable] + (candCol[usable] - 1) * nrow(amat)
	uCell <- unique(cell)
	# rowsum() returns one total per group, ordered by group number 1..length(uCell),
	# which lines up with uCell. An NA amount makes its cell NA, as plain addition would.
	cellTotal <- rowsum(pacDat$Amount[usable] + 0, match(cell, uCell))
	amat[uCell] <- cellTotal[, 1]
}

### Subset down to candidates who received more than 5k and pacs that gave at least 5k
# ind <- which(apply(amat,2,sum) > 5000)
# amat <- amat[,ind]
# Try out clustering candidates
# amat <- t(amat)
# ind <- which(apply(amat,2,sum) > 0)
# amat <- amat[,ind]

### Recode negative edges as 0
# Clamp every negative PAC/candidate total to zero in one vectorized assignment.
# which() skips NA cells, so they are left untouched, and the statement is a safe no-op
# on an empty matrix (a 1:nrow(amat) loop would fail with a subscript error there).
amat[which(amat < 0)] <- 0


### FlexMix ###
#### Using Count (amount donated) #####
# Fit one mixture of multivariate Poisson components to the rows of amat for every
# requested number of clusters k = 5..50. The fit and its BIC are stored at position k
# (positions 1-4 of BICs keep their initial Inf).
estims <- list()
BICs <- rep(Inf,50)
# Relative weight given to a row's k-means cluster in the initial memberships.
kmweight <- 1.5
for (k in 5:50) {
	# k-means on log(amount + 1) provides the starting partition.
	start <- kmeans(log(amat+1),k)
	init.clust <- start$cluster

	# Soft initial membership matrix: weight kmweight on the k-means cluster and 1 on
	# every other cluster, with each row normalized to sum to 1.
	prior <- matrix(1,nrow(amat),k)
	prior[cbind(seq_len(nrow(prior)), init.clust)] <- kmweight
	prior <- prior / rowSums(prior)

	flex1 <- flexmix(amat~1,cluster=prior,model=FLXMCmvpois(),control=list(tol=1e-50, minprior=0.01))
	estims[[k]] <- flex1
	bicK <- BIC(flex1)
	BICs[k] <- bicK
	# Progress: requested k and the BIC of its fit.
	print(c(k,bicK))
}

# Keep the fit with the smallest BIC; unfitted positions hold Inf and are never selected.
bestK <- which.min(BICs)
flex1 <- estims[[bestK]]

# Labels derived from the row names of amat: last character -> party, first -> incumbent.
# NOTE(review): the rows of amat are named after uPAC (PAC ids), so these two columns only
# carry party / incumbency information if amat is transposed so that candidates are the
# rows (see the commented-out `amat <- t(amat)` earlier in the script). Confirm intent.
rowLabel <- rownames(amat)
labelLen <- nchar(rowLabel)
party <- substr(rowLabel, labelLen, labelLen)
incumbent <- substr(rowLabel, 1, 1)
# Cluster assignment of each row under the selected fit.
cluster <- flex1@cluster

# Result bundle:
#   summaryMat - one row per row of amat: label, party, incumbent, cluster, row total
#   estimate   - every fitted model, indexed by its requested number of clusters
#   bic        - BIC of every fit (Inf where no model was fitted)
clustPois <- list(
	summaryMat = cbind(rownames(amat), party, incumbent, cluster, apply(amat, 1, sum)),
	estimate = estims,
	bic = BICs
)

save(clustPois, file = saveFile)













